import json
import os
import gzip

## Base folder: the `openalex-snapshot/` directory is read from here and
## both output files are written here.
path_openalex = '../'

## Input:
## 1. OpenAlex Sources (488 MB)
## 2. OpenAlex Works (413 GB)

## Output:
## 1. OpenAlexID_Year.txt (2 GB)
##    one line per kept work: <work id> TAB <publication year>
## 2. OpenAlexID_ReferencedPaperIDs.txt (17 GB)
##    one line per kept work: <work id> TAB <referenced id> TAB <referenced id> ...

path = path_openalex+'openalex-snapshot/data/sources/'
dirs = sorted(os.listdir(path))
## Journals: short source ID (URL prefix removed) -> display name
J = {}
for folder in dirs:
    print(folder)
    # Keep only `updated_date` partition folders; names containing '._' are skipped.
    if 'updated_date' not in folder or '._' in folder:
        continue
    folder_path = path+folder+'/'
    for archive in sorted(os.listdir(folder_path)):
        # Same filter at file level: gzip parts only, no '._' entries.
        if '.gz' not in archive or '._' in archive:
            continue
        # Each archive holds one JSON record per line.
        with gzip.open(folder_path+archive, 'r') as gz:
            for raw in gz:
                source = json.loads(raw)
                if source['type'] != 'journal':
                    continue
                # Drop the first 21 characters of the ID to keep the short form.
                J[source['id'][21:]] = source['display_name']
print(len(J))


path = path_openalex+'openalex-snapshot/data/works/'
dirs = sorted(os.listdir(path))
## Journal articles: short work ID (URL prefix removed) -> publication year
JournalArticle_Year = {}
with open(path_openalex+'OpenAlexID_Year.txt','w') as fy:
    for dir_ in dirs:
        print(dir_)
        # Keep `updated_date...` partition folders; names containing
        # '_updated_date' (e.g. '._updated_date...') are skipped.
        if '_updated_date' in dir_ or 'updated_date' not in dir_:
            continue
        files = sorted(os.listdir(path+dir_+'/'))
        for file in files:
            # Keep gzip parts; names containing '_part' (e.g. '._part...') are skipped.
            if '.gz' not in file or '_part' in file:
                continue
            # Each archive holds one JSON record per line.
            with gzip.open(path+dir_+'/'+file, 'rb') as f:
                for line in f:
                    line = json.loads(line)
                    # Only articles that have a publication year and at least one authorship.
                    if not (line['publication_year'] and line['authorships']
                            and line['type'] == 'article'):
                        continue
                    # `primary_location` and its `source` may be missing or null.
                    # Handle that explicitly instead of relying on a bare `except`,
                    # which would also hide write failures and Ctrl-C.
                    location = line.get('primary_location') or {}
                    source = location.get('source') or {}
                    source_id = source.get('id')
                    if not isinstance(source_id, str):
                        continue
                    # Keep the work only if its source is one of the journals in J.
                    if source_id[21:] in J:
                        id_ = line['id'][21:]
                        JournalArticle_Year[id_] = line['publication_year']
                        fy.write(id_+'\t'+str(line['publication_year'])+'\n')

print(len(JournalArticle_Year))


path = path_openalex+'openalex-snapshot/data/works/'
dirs = sorted(os.listdir(path))
## Second pass over the works: for every kept journal article, write the
## referenced works that are themselves kept journal articles.
with open(path_openalex+'OpenAlexID_ReferencedPaperIDs.txt','w') as ref_out:
    for folder in dirs:
        print(folder)
        # Same folder filter as the first works pass.
        if '_updated_date' in folder or 'updated_date' not in folder:
            continue
        folder_path = path+folder+'/'
        for archive in sorted(os.listdir(folder_path)):
            if '.gz' not in archive or '_part' in archive:
                continue
            # Each archive holds one JSON record per line.
            with gzip.open(folder_path+archive, 'rb') as gz:
                for raw in gz:
                    work = json.loads(raw)
                    work_id = work['id'][21:]
                    if work_id not in JournalArticle_Year:
                        continue
                    cited = work['referenced_works']
                    if not cited:
                        continue
                    # Shorten each referenced ID once and keep only known articles.
                    kept = []
                    for ref in cited:
                        short = ref[21:]
                        if short in JournalArticle_Year:
                            kept.append(short)
                    if kept:
                        ref_out.write(work_id+'\t'+'\t'.join(kept)+'\n')
                                                                     